Before we jump into predictive modeling, let's first understand the data using groupby and visualization.¶
In [1]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Load your dataset (original file remains unchanged)
Top_50_US_Merged_Data = pd.read_csv('Top_50_US_Merged_2024_10_29_to_2024_11_26.csv')
Top_50_Mexico_Merged_Data = pd.read_csv('Top_50_Mexico_Merged_2024_10_29_to_2024_11_26.csv')
In [16]:
# Convert ISO_Date to datetime
Top_50_US_Merged_Data['ISO_Date'] = pd.to_datetime(Top_50_US_Merged_Data['ISO_Date'])
# # Identify non-numeric columns
# non_numeric_columns = Top_50_US_Merged_Data.select_dtypes(exclude=['number']).columns
# print("Non-numeric columns:", non_numeric_columns)
# Group by date and calculate mean values for numerical attributes
numeric_columns = Top_50_US_Merged_Data.select_dtypes(include=['number']).columns
grouped_data = Top_50_US_Merged_Data.groupby('ISO_Date')[numeric_columns].mean()
# Plot Danceability, Energy, and Valence
plt.figure(figsize=(12, 6))
plt.plot(grouped_data.index, grouped_data['Danceability'], marker='o', label='Danceability', linestyle='-')
plt.plot(grouped_data.index, grouped_data['Energy'], marker='s', label='Energy', linestyle='-')
plt.plot(grouped_data.index, grouped_data['Valence'], marker='^', label='Valence', linestyle='-')
plt.title('Trends in Danceability, Energy, and Valence Over Time')
plt.xlabel('Date')
plt.ylabel('Attribute Value')
plt.legend(loc='upper right', title="Legend")
plt.grid()
plt.tight_layout()
plt.show()
# Plot Tempo
plt.figure(figsize=(12, 6))
plt.plot(grouped_data.index, grouped_data['Tempo'], marker='o', label='Tempo (US)', linestyle='-')
plt.title('Trends in Tempo Over Time')
plt.xlabel('Date')
plt.ylabel('Attribute Value')
plt.legend(loc='upper right', title="Legend")
plt.grid()
plt.tight_layout()
plt.show()
# Plot Loudness and Acousticness
plt.figure(figsize=(12, 6))
plt.plot(grouped_data.index, grouped_data['Loudness (dB)'], marker='s', label='Loudness (dB) (US)', linestyle='-')
plt.plot(grouped_data.index, grouped_data['Acousticness'], marker='d', label='Acousticness (US)', linestyle='-')
plt.title('Trends in Loudness and Acousticness Over Time')
plt.xlabel('Date')
plt.ylabel('Attribute Value')
plt.legend(loc='upper right', title="Legend")
plt.grid()
plt.tight_layout()
plt.show()
In [4]:
# Importing libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from plotnine import *
import plotnine as p9 # For visualizations similar to ggplot
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
In [5]:
# Count the number of distinct chart days for each track (how many days it stays in the US Top 50)
track_likelihood = Top_50_US_Merged_Data.groupby('Track Name')['ISO_Date'].nunique()
# Plot for Track Name
plt.figure(figsize=(12, 6))
plt.hist(track_likelihood, bins=range(1, track_likelihood.max() + 2), align='left', edgecolor='black')
plt.title('Likelihood of Track Name Staying the Same for X Days for US Only')
plt.xlabel('Number of Days')
plt.ylabel('Frequency')
plt.xticks(range(1, track_likelihood.max() + 1))
plt.grid()
plt.show()
# Count the number of distinct chart days for each artist
artist_likelihood = Top_50_US_Merged_Data.groupby('Artists')['ISO_Date'].nunique()
# Plot for Artist
plt.figure(figsize=(12, 6))
plt.hist(artist_likelihood, bins=range(1, artist_likelihood.max() + 2), align='left', edgecolor='black')
plt.title('Likelihood of Artist Staying the Same for X Days for US Only')
plt.xlabel('Number of Days')
plt.ylabel('Frequency')
plt.xticks(range(1, artist_likelihood.max() + 1))
plt.grid()
plt.show()
Summary of the Results:¶
- Likelihood of Track Name Staying the Same for X Days:
- High Turnover for 1 Day: A significant number of tracks appear on the chart for only 1 day, indicating high turnover in the rankings.
- Consistency Peaks at 29 Days: A noticeable cluster of tracks remains on the chart for the entire 29-day observation period, suggesting a few tracks dominate the charts for a long time.
- Moderate Variability: The distribution between 2 and 28 days shows moderate variability, with fewer tracks staying consistent compared to 1 or 29 days.
- Likelihood of Artist Staying the Same for X Days:
- High Turnover for 1 Day: Similar to track names, many artists appear for only 1 day.
- Peak at 29 Days: A significant number of artists also remain consistent in the rankings for 29 days, showing dominance by certain artists.
- Broader Distribution: Compared to track names, artists have a slightly broader distribution of consistency across different days, indicating that artists may have multiple tracks in the rankings or return with new entries.
Key Insights:¶
- Dominance by Few Tracks and Artists: The peaks at 29 days for both tracks and artists highlight that certain tracks and artists have strong and lasting appeal.
- Frequent Turnover: The large number of 1-day entries suggests significant competition and short-lived popularity for many tracks and artists.
- Artist Loyalty vs. Track Loyalty: Artists tend to have a broader range of consistent appearances than individual tracks, likely because a single artist can have several tracks chart during the period (a quick check is sketched after this summary).
These trends could provide insights into listener preferences, the competitive nature of the music charts, and the factors driving short-lived versus long-term success.
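The multiple-tracks-per-artist explanation can be checked directly from the data. A minimal sketch, assuming the `Top_50_US_Merged_Data` frame and the column names used above:
# Quick check (sketch): how many distinct tracks did each artist place on the US chart?
tracks_per_artist = (
    Top_50_US_Merged_Data.groupby('Artists')['Track Name'].nunique()
    .sort_values(ascending=False)
)
print(tracks_per_artist.head(10))          # artists with the most charting tracks
print((tracks_per_artist > 1).mean())      # share of artists charting more than one track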
In [6]:
# Convert ISO_Date to datetime for both datasets
Top_50_US_Merged_Data['ISO_Date'] = pd.to_datetime(Top_50_US_Merged_Data['ISO_Date'])
Top_50_Mexico_Merged_Data['ISO_Date'] = pd.to_datetime(Top_50_Mexico_Merged_Data['ISO_Date'])
# Group by date and calculate mean values for numerical attributes
numeric_columns_us = Top_50_US_Merged_Data.select_dtypes(include=['number']).columns
numeric_columns_mexico = Top_50_Mexico_Merged_Data.select_dtypes(include=['number']).columns
grouped_data_us = Top_50_US_Merged_Data.groupby('ISO_Date')[numeric_columns_us].mean()
grouped_data_mexico = Top_50_Mexico_Merged_Data.groupby('ISO_Date')[numeric_columns_mexico].mean()
# Plot Danceability, Energy, and Valence (US vs. Mexico)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(grouped_data_us.index, grouped_data_us['Danceability'], marker='o', label='Danceability (US)', linestyle='-')
ax.plot(grouped_data_mexico.index, grouped_data_mexico['Danceability'], marker='o', label='Danceability (Mexico)', linestyle='--')
ax.plot(grouped_data_us.index, grouped_data_us['Energy'], marker='s', label='Energy (US)', linestyle='-')
ax.plot(grouped_data_mexico.index, grouped_data_mexico['Energy'], marker='s', label='Energy (Mexico)', linestyle='--')
ax.plot(grouped_data_us.index, grouped_data_us['Valence'], marker='^', label='Valence (US)', linestyle='-')
ax.plot(grouped_data_mexico.index, grouped_data_mexico['Valence'], marker='^', label='Valence (Mexico)', linestyle='--')
ax.set_title('Trends in Danceability, Energy, and Valence Over Time (US vs. Mexico)')
ax.set_xlabel('Date')
ax.set_ylabel('Attribute Value')
ax.grid()
fig.legend(
loc='upper right', bbox_to_anchor=(0.95, 0.95), title="Legend"
)
plt.tight_layout()
plt.show()
# Plot Tempo (US vs. Mexico)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(grouped_data_us.index, grouped_data_us['Tempo'], marker='o', label='Tempo (US)', linestyle='-')
ax.plot(grouped_data_mexico.index, grouped_data_mexico['Tempo'], marker='o', label='Tempo (Mexico)', linestyle='--')
ax.set_title('Trends in Tempo Over Time (US vs. Mexico)')
ax.set_xlabel('Date')
ax.set_ylabel('Attribute Value')
ax.grid()
fig.legend(
loc='upper right', bbox_to_anchor=(0.95, 0.95), title="Legend"
)
plt.tight_layout()
plt.show()
# Plot Acousticness and Loudness (US vs. Mexico)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(grouped_data_us.index, grouped_data_us['Acousticness'], marker='d', label='Acousticness (US)', linestyle='-')
ax.plot(grouped_data_mexico.index, grouped_data_mexico['Acousticness'], marker='d', label='Acousticness (Mexico)', linestyle='--')
ax.plot(grouped_data_us.index, grouped_data_us['Loudness (dB)'], marker='s', label='Loudness (dB) (US)', linestyle='-')
ax.plot(grouped_data_mexico.index, grouped_data_mexico['Loudness (dB)'], marker='s', label='Loudness (dB) (Mexico)', linestyle='--')
ax.set_title('Trends in Acousticness and Loudness Over Time (US vs. Mexico)')
ax.set_xlabel('Date')
ax.set_ylabel('Attribute Value')
ax.grid()
fig.legend(
loc='upper right', bbox_to_anchor=(0.95, 0.95), title="Legend"
)
plt.tight_layout()
plt.show()
In [7]:
# Count the number of distinct chart days for each track in the US and Mexico charts
track_likelihood_us = Top_50_US_Merged_Data.groupby('Track Name')['ISO_Date'].nunique()
track_likelihood_mexico = Top_50_Mexico_Merged_Data.groupby('Track Name')['ISO_Date'].nunique()
# Plot for Track Name (US vs. Mexico)
plt.figure(figsize=(12, 6))
plt.hist(track_likelihood_us, bins=range(1, track_likelihood_us.max() + 2), alpha=0.6, label='US', color='blue', edgecolor='black')
plt.hist(track_likelihood_mexico, bins=range(1, track_likelihood_mexico.max() + 2), alpha=0.6, label='Mexico', color='orange', edgecolor='black')
plt.title('Likelihood of Track Name Staying the Same for X Days (US vs. Mexico)')
plt.xlabel('Number of Days')
plt.ylabel('Frequency')
plt.xticks(range(1, max(track_likelihood_us.max(), track_likelihood_mexico.max()) + 1))
plt.legend()
plt.grid()
plt.show()
# Count the number of distinct chart days for each artist in the US and Mexico charts
artist_likelihood_us = Top_50_US_Merged_Data.groupby('Artists')['ISO_Date'].nunique()
artist_likelihood_mexico = Top_50_Mexico_Merged_Data.groupby('Artists')['ISO_Date'].nunique()
# Plot for Artist (US vs. Mexico)
plt.figure(figsize=(12, 6))
plt.hist(artist_likelihood_us, bins=range(1, artist_likelihood_us.max() + 2), alpha=0.6, label='US', color='green', edgecolor='black')
plt.hist(artist_likelihood_mexico, bins=range(1, artist_likelihood_mexico.max() + 2), alpha=0.6, label='Mexico', color='red', edgecolor='black')
plt.title('Likelihood of Artist Staying the Same for X Days (US vs. Mexico)')
plt.xlabel('Number of Days')
plt.ylabel('Frequency')
plt.xticks(range(1, max(artist_likelihood_us.max(), artist_likelihood_mexico.max()) + 1))
plt.legend()
plt.grid()
plt.show()
Now let's start with the predictive modeling :)¶
In [8]:
# Importing libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from plotnine import *
import plotnine as p9 # For visualizations similar to ggplot
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
In [9]:
# Split into 75% training and 25% validation sets, ensuring randomness
Top_50_US_Merged_Training_Data, Top_50_US_Merged_Val_Data = train_test_split(
Top_50_US_Merged_Data,
test_size=0.25,
random_state=42 # Ensures reproducibility
)
# Save the splits to new CSV files
Top_50_US_Merged_Training_Data.to_csv('Top_50_US_Merged_Training_Data.csv', index=False)
Top_50_US_Merged_Val_Data.to_csv('Top_50_US_Merged_Val_Data.csv', index=False)
# Output confirmation
print("Training and validation datasets saved as 'Top_50_US_Merged_Training_Data.csv' and 'Top_50_US_Merged_Val_Data.csv'.")
# Print the number of rows in each split
Train_Row_Count = Top_50_US_Merged_Training_Data.shape[0]
Val_Row_Count = Top_50_US_Merged_Val_Data.shape[0]
print(Train_Row_Count, "rows in training dataset")
print(Val_Row_Count, "rows in validation dataset")
Training and validation datasets saved as 'Top_50_US_Merged_Training_Data.csv' and 'Top_50_US_Merged_Val_Data.csv'.
1087 rows in training dataset
363 rows in validation dataset
Business Case¶
Build a predictive model using the Top 50 songs’ characteristics to estimate the features required for a song to chart in the Top 50. Validate the model using a separate dataset of Top 50 songs and evaluate its performance. Additionally, provide insights into the key features influencing a song’s likelihood to be in the Top 50 and summarize the range of these features.
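Before modeling, the "range of these features" part of the brief can be summarized directly from the training split. A minimal sketch, assuming the same feature columns used for the model below:
# Sketch: observed range of each candidate audio feature among charting (Top 50) songs
candidate_features = ['Danceability', 'Energy', 'Tempo', 'Valence',
                      'Acousticness', 'Liveness', 'Loudness (dB)']
feature_ranges = (
    Top_50_US_Merged_Training_Data[candidate_features]
    .describe()
    .loc[['min', '25%', '75%', 'max']]
    .T  # one row per feature
)
print(feature_ranges)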
In [15]:
# # Explore the dataset ----------------------------------------------------------------
# print(Top_50_US_Merged_Training_Data.info())
# print(Top_50_US_Merged_Training_Data.describe())
# # Identify unique values in important columns
# print("Unique values in 'Artists':", Top_50_US_Merged_Training_Data['Artists'].nunique())
# print("Unique values in 'Track Name':", Top_50_US_Merged_Training_Data['Track Name'].nunique())
In [11]:
# Add a binary column 'in_top_50' to label all songs as being in the Top 50
Top_50_US_Merged_Training_Data['in_top_50'] = 1
Top_50_US_Merged_Val_Data['in_top_50'] = 1
# Define the features to be used for the logistic regression model
features = ['Danceability', 'Energy', 'Tempo', 'Valence', 'Acousticness', 'Liveness', 'Loudness (dB)']
# Extract X (features) and y (target) for training and validation datasets
X_train = Top_50_US_Merged_Training_Data[features]
y_train = Top_50_US_Merged_Training_Data['in_top_50']
X_val = Top_50_US_Merged_Val_Data[features]
y_val = Top_50_US_Merged_Val_Data['in_top_50']
# Print the shape of the training and validation datasets for verification
print(f"Training Features Shape: {X_train.shape}, Training Target Shape: {y_train.shape}")
print(f"Validation Features Shape: {X_val.shape}, Validation Target Shape: {y_val.shape}")
# Define realistic ranges for features to generate synthetic non-Top 50 data
realistic_ranges = {
'Danceability': (0.3, 0.7),
'Energy': (0.2, 0.6),
'Tempo': (70, 120),
'Valence': (0.2, 0.6),
'Acousticness': (0.5, 1.0),
'Liveness': (0.1, 0.4),
'Loudness (dB)': (-30, -10)
}
# Generate synthetic non-Top 50 songs for training data
num_synthetic_train_samples = len(Top_50_US_Merged_Training_Data)
synthetic_train_samples = pd.DataFrame({
feature: np.random.uniform(
low=realistic_ranges[feature][0],
high=realistic_ranges[feature][1],
size=num_synthetic_train_samples
)
for feature in realistic_ranges.keys()
})
# Label synthetic training samples as not in Top 50
synthetic_train_samples['in_top_50'] = 0
# Combine Top 50 training songs with synthetic non-Top 50 songs to create a balanced training dataset
balanced_train_data = pd.concat([Top_50_US_Merged_Training_Data, synthetic_train_samples], ignore_index=True)
# Separate features and target for the balanced training dataset
X_train = balanced_train_data[features]
y_train = balanced_train_data['in_top_50']
# Train a logistic regression model on the balanced training dataset
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
# Generate synthetic non-Top 50 songs for validation data
num_synthetic_val_samples = len(Top_50_US_Merged_Val_Data)
synthetic_val_samples = pd.DataFrame({
feature: np.random.uniform(
low=realistic_ranges[feature][0],
high=realistic_ranges[feature][1],
size=num_synthetic_val_samples
)
for feature in realistic_ranges.keys()
})
# Label synthetic validation samples as not in Top 50
synthetic_val_samples['in_top_50'] = 0
# Combine Top 50 validation songs with synthetic non-Top 50 songs
balanced_val_data = pd.concat([Top_50_US_Merged_Val_Data, synthetic_val_samples], ignore_index=True)
# Separate features and target for the balanced validation dataset
X_balanced_val = balanced_val_data[features]
y_balanced_val = balanced_val_data['in_top_50']
# Predict on the balanced validation set using the trained model
y_pred_balanced = model.predict(X_balanced_val)
# Evaluate the model's performance on the balanced validation set
print("\nClassification Report on Balanced Validation Set:")
print(classification_report(y_balanced_val, y_pred_balanced))
print("\nConfusion Matrix on Balanced Validation Set:")
print(confusion_matrix(y_balanced_val, y_pred_balanced))
Training Features Shape: (1087, 7), Training Target Shape: (1087,)
Validation Features Shape: (363, 7), Validation Target Shape: (363,)

Classification Report on Balanced Validation Set:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       363
           1       0.97      0.99      0.98       363

    accuracy                           0.98       726
   macro avg       0.98      0.98      0.98       726
weighted avg       0.98      0.98      0.98       726

Confusion Matrix on Balanced Validation Set:
[[350  13]
 [  2 361]]
Spotify has deprecated some of its API functionality, so we are unable to pull non-Top-50 data; this is documented at the websites below:¶
In [12]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
import pandas as pd
# Evaluate the model's performance on the balanced validation set
overall_accuracy = model.score(X_balanced_val, y_balanced_val)
classification_metrics = classification_report(y_balanced_val, y_pred_balanced, output_dict=True)
# Confusion matrix as a DataFrame
conf_matrix = pd.crosstab(y_balanced_val, y_pred_balanced, rownames=['Actual'], colnames=['Predicted'])
# Confusion matrix visualization with interpretation embedded
fig, ax = plt.subplots(figsize=(8, 8))
disp = ConfusionMatrixDisplay.from_predictions(
y_balanced_val, y_pred_balanced, ax=ax, cmap='Blues'
)
plt.title("Confusion Matrix: Balanced Validation Set")
plt.grid(False)
# Add interpretation as text slightly below the confusion matrix
interpretation = (
f"Interpretation:\n"
f" - True Negatives (Class 0 correctly predicted): {conf_matrix.loc[0, 0]}\n"
f" - False Positives (Class 0 incorrectly predicted as Class 1): {conf_matrix.loc[0, 1]}\n"
f" - False Negatives (Class 1 incorrectly predicted as Class 0): {conf_matrix.loc[1, 0]}\n"
f" - True Positives (Class 1 correctly predicted): {conf_matrix.loc[1, 1]}"
)
plt.gcf().text(0.1, 0.02, interpretation, fontsize=10, wrap=True, horizontalalignment='left')
plt.tight_layout(rect=[0, 0.1, 1, 1]) # Adjust layout to provide space for interpretation text
plt.show()
# Convert classification report to DataFrame for plotting
df_report = pd.DataFrame(classification_metrics).transpose()
# Plot Precision, Recall, F1-Score for each class
fig, ax = plt.subplots(figsize=(10, 6))
df_report.loc[['0', '1'], ['precision', 'recall', 'f1-score']].plot.bar(
ax=ax, rot=0, color=['skyblue', 'orange', 'green']
)
plt.title("Classification Metrics by Class")
plt.ylabel("Score")
plt.xlabel("Class (0 = Non-Top 50, 1 = Top 50)")
plt.ylim(0, 1.1)
plt.legend(loc="lower right", title="Metric")
plt.grid(axis='y')
# Add definitions of metrics below the chart
definitions = (
"Definitions:\n"
" - Precision: Proportion of correct positive predictions.\n"
" - Recall: Proportion of actual positives correctly predicted.\n"
" - F1-Score: Harmonic mean of precision and recall."
)
plt.figtext(0.1, -0.1, definitions, wrap=True, horizontalalignment='left', fontsize=10)
plt.tight_layout()
plt.show()
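To speak to the "key features" part of the business case, the fitted coefficients can be ranked. A minimal sketch, assuming the `model` and `features` objects defined earlier; since the features are not standardized, coefficient magnitudes are only roughly comparable across features:
# Sketch: rank features by the size of their logistic regression coefficients.
# Positive values push a song toward the Top 50 class; negative values away from it.
coef_summary = pd.Series(model.coef_[0], index=features).sort_values(key=abs, ascending=False)
print(coef_summary)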
In [13]:
import matplotlib.pyplot as plt
# Convert ISO_Date to datetime
Top_50_US_Merged_Data['ISO_Date'] = pd.to_datetime(Top_50_US_Merged_Data['ISO_Date'])
# Identify non-numeric columns
non_numeric_columns = Top_50_US_Merged_Data.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_columns)
# Group by date and calculate mean values for numerical attributes (actual data)
numeric_columns = Top_50_US_Merged_Data.select_dtypes(include=['number']).columns
grouped_data_actuals = Top_50_US_Merged_Data.groupby('ISO_Date')[numeric_columns].mean()
# Score the original Top 50 training rows with the trained classifier
X_train_original = Top_50_US_Merged_Training_Data[features]  # features from the original training data
predicted_values = model.predict(X_train_original)  # 0/1 class labels for each training row
# Attach the class predictions to a copy of the original training data
predicted_data = Top_50_US_Merged_Training_Data.copy()
predicted_data['Prediction'] = predicted_values
# Group the scored training rows by date and average the features.
# Note: the classifier outputs a 0/1 label, so the 'Predicted' lines plotted below are
# daily mean feature values of the scored training rows, not model-generated feature values.
grouped_data_predictions = (
    predicted_data.groupby('ISO_Date')[features].mean()
)
# Plot Danceability, Energy, and Valence with overlay
plt.figure(figsize=(12, 6))
plt.plot(
grouped_data_actuals.index,
grouped_data_actuals['Danceability'],
marker='o',
label='Danceability (Actual)',
linestyle='-',
color='navy'
)
plt.plot(
grouped_data_predictions.index,
grouped_data_predictions['Danceability'],
marker='o',
label='Danceability (Predicted)',
linestyle='--',
color='royalblue'
)
plt.plot(
grouped_data_actuals.index,
grouped_data_actuals['Energy'],
marker='s',
label='Energy (Actual)',
linestyle='-',
color='green'
)
plt.plot(
grouped_data_predictions.index,
grouped_data_predictions['Energy'],
marker='s',
label='Energy (Predicted)',
linestyle='--',
color='lightgreen'
)
plt.plot(
grouped_data_actuals.index,
grouped_data_actuals['Valence'],
marker='^',
label='Valence (Actual)',
linestyle='-',
color='orange'
)
plt.plot(
grouped_data_predictions.index,
grouped_data_predictions['Valence'],
marker='^',
label='Valence (Predicted)',
linestyle='--',
color='gold'
)
plt.title('Trends in Song Attributes Over Time (Actual vs. Predicted)')
plt.xlabel('Date')
plt.ylabel('Attribute Value')
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.grid()
plt.tight_layout()
plt.show()
# Plot Tempo with overlay
plt.figure(figsize=(12, 6))
plt.plot(
grouped_data_actuals.index,
grouped_data_actuals['Tempo'],
marker='o',
label='Tempo (Actual)',
linestyle='-',
color='navy'
)
plt.plot(
grouped_data_predictions.index,
grouped_data_predictions['Tempo'],
marker='o',
label='Tempo (Predicted)',
linestyle='--',
color='royalblue'
)
plt.title('Trends in Tempo Over Time (Actual vs. Predicted)')
plt.xlabel('Date')
plt.ylabel('Attribute Value')
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.grid()
plt.tight_layout()
plt.show()
# Plot Loudness and Acousticness with overlay
plt.figure(figsize=(12, 6))
plt.plot(
grouped_data_actuals.index,
grouped_data_actuals['Loudness (dB)'],
marker='s',
label='Loudness (dB) (Actual)',
linestyle='-',
color='green'
)
plt.plot(
grouped_data_predictions.index,
grouped_data_predictions['Loudness (dB)'],
marker='s',
label='Loudness (dB) (Predicted)',
linestyle='--',
color='lightgreen'
)
plt.plot(
grouped_data_actuals.index,
grouped_data_actuals['Acousticness'],
marker='d',
label='Acousticness (Actual)',
linestyle='-',
color='orange'
)
plt.plot(
grouped_data_predictions.index,
grouped_data_predictions['Acousticness'],
marker='d',
label='Acousticness (Predicted)',
linestyle='--',
color='gold'
)
plt.title('Trends in Loudness and Acousticness Over Time (Actual vs. Predicted)')
plt.xlabel('Date')
plt.ylabel('Attribute Value')
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.grid()
plt.tight_layout()
plt.show()
Non-numeric columns: Index(['Track Name', 'Artists', 'ISO_Date'], dtype='object')
Plot Summary¶
- Solid Lines: Daily mean feature values computed from the full US Top 50 dataset.
- Dashed Lines: Daily mean feature values for the training rows scored by the trained model (labeled "Predicted" in the legends).
Why This Visualization is Useful¶
- Model Validation: Shows how well the model predicts feature values compared to actual data.
- Feature Trends: Highlights temporal trends and variability in key features.
- Error Identification: Pinpoints where the two series deviate, identifying model weaknesses (a short sketch quantifying these gaps follows the takeaways below).
Key Takeaways¶
Plot 1: Danceability, Energy, and Valence
- Danceability and Energy predictions align well with actuals.
- Valence shows some deviations, especially at peaks and troughs, suggesting the model struggles with extreme changes.
Plot 2: Tempo
- Tempo predictions are close to actuals, but the model struggles with sharp peaks and valleys, potentially smoothing rapid changes.
Plot 3: Loudness and Acousticness
- Loudness predictions match actual values closely, showcasing strong accuracy.
- Acousticness aligns well but shows slight overfitting in some areas, indicating sensitivity to training noise.
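To quantify where the dashed "Predicted" lines drift from the solid "Actual" lines, a per-feature gap can be computed. A minimal sketch, assuming the `grouped_data_actuals`, `grouped_data_predictions`, and `features` objects built in the plotting cell above:
# Sketch: mean absolute gap per feature between the two daily series, on dates present in both
common_dates = grouped_data_actuals.index.intersection(grouped_data_predictions.index)
gap_per_feature = (
    grouped_data_actuals.loc[common_dates, features]
    - grouped_data_predictions.loc[common_dates, features]
).abs().mean()
print(gap_per_feature.sort_values(ascending=False))  # features with the largest average gap first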